# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and is used by platform APIs.
from project_lib import Project
project = Project(project_id='...', project_access_token='...')
In this notebook you will load, explore, clean, and visualize data from the IBM Debater Sentiment Composition Lexicons dataset. The dataset includes both sentiment composition lexicons and sentiment lexicons.
The dataset addresses sentiment composition – predicting the sentiment of a phrase from the interaction between its constituents. For example, in the phrases “reduced bureaucracy” and “fresh injury”, both “reduced” and “fresh” are followed by a negative word. However, “reduced” flips the negative polarity, resulting in a positive phrase, while “fresh” propagates the negative polarity to the phrase level, resulting in a negative phrase. Accordingly, “reduced” is part of our “reversers” lexicon, and “fresh” is part of the “propagators” lexicon.
The dataset can be obtained for free from the IBM Developer Data Asset Exchange.
Before you run this notebook, complete the following steps:
When you import this project from the Watson Studio Gallery, a token should be automatically generated and inserted at the top of this notebook as a code cell such as the one below:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and is used by platform APIs.
from project_lib import Project
project = Project(project_id='YOUR_PROJECT_ID', project_access_token='YOUR_PROJECT_TOKEN')
pc = project.project_context
If you do not see the cell above, follow these steps to enable the notebook to access the dataset from the project's resources:
Click More -> Insert project token in the top-right menu section. This should insert a cell at the top of this notebook similar to the example given above.
If an error is displayed indicating that no project token is defined, follow these instructions.
Run the newly inserted cell before proceeding with the rest of the notebook.
Import and configure the required modules.
# Install and import the required modules
!pip install cufflinks

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import seaborn as sns
import cufflinks as cf
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import clear_output

# Enable cufflinks offline mode so DataFrames gain an iplot() method
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Clear the pip install output
clear_output()
The goal of this set of notebooks is to use the IBM Debater Sentiment Composition Lexicons dataset to categorize text, at the sentence level or as a whole, with a range of sentiments. This could be used, for example, in an application that collects customer feedback to help determine customer satisfaction.
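To make the goal concrete, here is a minimal sketch, not part of the original notebooks, of how a unigram lexicon could score a piece of text; the score_sentence function and the toy lexicon entries are illustrative assumptions only:
# Minimal sketch: score text by averaging unigram lexicon scores (illustrative only)
def score_sentence(sentence, lexicon):
    scores = [lexicon[word] for word in sentence.lower().split() if word in lexicon]
    return sum(scores) / len(scores) if scores else 0.0

# Hypothetical lexicon entries; the real lexicon is loaded later in this notebook
print(score_sentence("great service", {"great": 0.8, "service": 0.1}))  # prints 0.45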
Let's first define a few terms (a short code illustration follows this list):
Sentiment Analysis: Using natural language processing to determine the sentiment of a piece of text, e.g. determine if the text has a positive, negative, or neutral connotation.
N-gram: a sequence of N terms
Unigram: an N-gram with one term, e.g. “hello”
Bi-gram: an N-gram with two terms, e.g. “hello world”
POS (part of speech) tag: a word's part of speech, e.g. the POS tag for "dog" would be noun (or NN in the NLTK Python library).
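As a quick illustration of unigrams and bigrams, scikit-learn's CountVectorizer (imported in the setup cell above) can enumerate them for a toy sentence; this snippet is an added sketch, not part of the original analysis:
# Illustrative only: enumerate the unigrams and bigrams of a toy sentence
vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorizer.fit(["reduced bureaucracy caused fresh injury"])
print(sorted(vectorizer.vocabulary_))
# ['bureaucracy', 'bureaucracy caused', 'caused', 'caused fresh', 'fresh',
#  'fresh injury', 'injury', 'reduced', 'reduced bureaucracy']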
The first step is to load the LEXICON_UG.txt and LEXICON_BG.txt datasets and then add a sentiment column derived from the SENTIMENT_SCORE column, where 1 denotes positive sentiment and 0 denotes negative sentiment.
# Define a helper that returns a file handle for a raw data file in project storage
def get_file_handle(fname):
    # Fetch the file from the project's data assets and rewind to the start
    data_path = project.get_file(fname)
    data_path.seek(0)
    return data_path
This is a list of 66,058 unigrams and their predicted sentiment scores. Note that in the paper Learning Sentiment Composition from Sentiment Lexicons, for unigrams that appear in the HL lexicon (the publicly available sentiment lexicon of Hu and Liu (2004)), the original HL sentiment (+1 or -1) was used rather than the predicted score. This step is not reflected in the released lexicon.
# define filename
DATA_PATH = 'LEXICON_UG.txt'
# Using pandas to read the data
data_path = get_file_handle(DATA_PATH)
unigrams = pd.read_csv(data_path, sep=" ")
unigrams.head()
| | UNIGRAM | SENTIMENT_SCORE |
|---|---|---|
| 0 | aa | 0.019674 |
| 1 | aaa | 0.032775 |
| 2 | aaas | 0.074593 |
| 3 | aachen | 0.011926 |
| 4 | aah | 0.118070 |
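If the HL lexicon were available, the override described above could be re-applied along these lines; this is a hypothetical sketch, and hl_scores is a placeholder mapping that is not part of this dataset:
# Hypothetical: where a unigram appears in the HL lexicon, use its +1/-1 polarity instead
hl_scores = {'good': 1.0, 'bad': -1.0}  # placeholder entries for illustration only
adjusted_scores = unigrams['UNIGRAM'].map(hl_scores).fillna(unigrams['SENTIMENT_SCORE'])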
This is a list of 262,555 selected bigrams in the following format:
# define filename
DATA_PATH = 'LEXICON_BG.txt'
# Using pandas to read the data
data_path = get_file_handle(DATA_PATH)
bigrams = pd.read_csv(data_path, sep=" ")
bigrams.head()
| | BIGRAM | POS_TAGS | SENTIMENT_SCORE |
|---|---|---|---|
| 0 | abalone-divers | NN-NNS | -0.090230 |
| 1 | abandoned-animals | VBN-NNS | -0.089895 |
| 2 | abandoned-apartment | VBN-NN | -0.126907 |
| 3 | abandoned-attempts | VBN-NNS | -0.053709 |
| 4 | abandoned-babies | VBN-NNS | -0.074742 |
# Add sentiment column
unigrams['sentiment'] = np.where(unigrams['SENTIMENT_SCORE'] > 0, 1, 0) # 1 is positive, 0 is negative
unigrams.head()
| | UNIGRAM | SENTIMENT_SCORE | sentiment |
|---|---|---|---|
| 0 | aa | 0.019674 | 1 |
| 1 | aaa | 0.032775 | 1 |
| 2 | aaas | 0.074593 | 1 |
| 3 | aachen | 0.011926 | 1 |
| 4 | aah | 0.118070 | 1 |
We want to check the distribution of the unigram sentiment polarity scores.
# Plot the distribution of the unigram sentiment polarity scores
unigrams['SENTIMENT_SCORE'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')
# The polarity scores form a roughly bell-shaped curve centered at 0,
# meaning about half of the unigrams are positive and half are negative.
The distribution of sentiment polarity is roughly bell-shaped and centered at zero, so the positive and negative labels are of similar size.
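The split can also be checked directly from the new sentiment labels; this is a small added check, not in the original notebook:
# Fraction of positive (1) versus negative (0) unigrams
print(unigrams['sentiment'].value_counts(normalize=True))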
Then, let's check the unigram text length distribution.
# Get the string length of each unigram word
unigrams['uni_len'] = [len(str(i)) for i in unigrams['UNIGRAM']]
# Plot the unigram length distribution
unigrams['uni_len'].iplot(
    kind='hist',
    xTitle='unigram length',
    linecolor='black',
    yTitle='count',
    title='Unigram Text Length Distribution')
The distribution of unigram text lengths is also approximately normal, so the lexicon is not dominated by unusually short or unusually long terms.
Next, we store the first letter of each term in a new column.
unigrams['first_letter'] = unigrams.UNIGRAM.str[0]
unigrams.head()
| | UNIGRAM | SENTIMENT_SCORE | sentiment | uni_len | first_letter |
|---|---|---|---|---|---|
| 0 | aa | 0.019674 | 1 | 2 | a |
| 1 | aaa | 0.032775 | 1 | 3 | a |
| 2 | aaas | 0.074593 | 1 | 4 | a |
| 3 | aachen | 0.011926 | 1 | 6 | a |
| 4 | aah | 0.118070 | 1 | 3 | a |
In the next steps, we group by the first letter and sentiment of each term, and then count the number of unigrams in each group.
# Count the number of unigrams for each first letter and sentiment
group_data = unigrams.groupby(['first_letter', 'sentiment'])
group_data.count()
| first_letter | sentiment | UNIGRAM | SENTIMENT_SCORE | uni_len |
|---|---|---|---|---|
| a | 0 | 2022 | 2022 | 2022 |
| a | 1 | 2041 | 2041 | 2041 |
| b | 0 | 2569 | 2569 | 2569 |
| b | 1 | 1409 | 1409 | 1409 |
| c | 0 | 3595 | 3595 | 3595 |
| c | 1 | 2787 | 2787 | 2787 |
| d | 0 | 2645 | 2645 | 2645 |
| d | 1 | 1261 | 1261 | 1261 |
| e | 0 | 1293 | 1293 | 1293 |
| e | 1 | 1413 | 1413 | 1413 |
| f | 0 | 1637 | 1637 | 1637 |
| f | 1 | 1062 | 1062 | 1062 |
| g | 0 | 1242 | 1242 | 1242 |
| g | 1 | 973 | 973 | 973 |
| h | 0 | 1531 | 1531 | 1531 |
| h | 1 | 995 | 995 | 995 |
| i | 0 | 1594 | 1594 | 1594 |
| i | 1 | 1013 | 1013 | 1013 |
| j | 0 | 400 | 400 | 400 |
| j | 1 | 230 | 230 | 230 |
| k | 0 | 435 | 435 | 435 |
| k | 1 | 301 | 301 | 301 |
| l | 0 | 1280 | 1280 | 1280 |
| l | 1 | 931 | 931 | 931 |
| m | 0 | 2205 | 2205 | 2205 |
| m | 1 | 1493 | 1493 | 1493 |
| n | 0 | 810 | 810 | 810 |
| n | 1 | 592 | 592 | 592 |
| o | 0 | 962 | 962 | 962 |
| o | 1 | 628 | 628 | 628 |
| p | 0 | 3001 | 3001 | 3001 |
| p | 1 | 2094 | 2094 | 2094 |
| q | 0 | 168 | 168 | 168 |
| q | 1 | 130 | 130 | 130 |
| r | 0 | 2023 | 2023 | 2023 |
| r | 1 | 1730 | 1730 | 1730 |
| s | 0 | 4768 | 4768 | 4768 |
| s | 1 | 2714 | 2714 | 2714 |
| t | 0 | 2035 | 2035 | 2035 |
| t | 1 | 1351 | 1351 | 1351 |
| u | 0 | 1121 | 1121 | 1121 |
| u | 1 | 446 | 446 | 446 |
| v | 0 | 539 | 539 | 539 |
| v | 1 | 471 | 471 | 471 |
| w | 0 | 1123 | 1123 | 1123 |
| w | 1 | 552 | 552 | 552 |
| x | 0 | 28 | 28 | 28 |
| x | 1 | 28 | 28 | 28 |
| y | 0 | 116 | 116 | 116 |
| y | 1 | 91 | 91 | 91 |
| z | 0 | 117 | 117 | 117 |
| z | 1 | 61 | 61 | 61 |
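A more compact view of the same grouping unstacks the sentiment level into side-by-side columns; this pivot is an added convenience sketch, not part of the original notebook:
# Pivot sentiment counts into columns and compute the positive share per letter
letter_counts = unigrams.groupby(['first_letter', 'sentiment']).size().unstack(fill_value=0)
letter_counts.columns = ['negative', 'positive']
letter_counts['positive_share'] = letter_counts['positive'] / (letter_counts['negative'] + letter_counts['positive'])
letter_counts.head()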
Now we visualize the distribution of unigram counts across first letters.
plt.figure(figsize=(20, 20))
sns.set(style="darkgrid")
ax = sns.countplot(x="first_letter", data=unigrams)
plt.title('Data Distribution')
# Annotate each bar with its total count
for p in ax.patches:
    total_count = str(p.get_height())
    x = p.get_x() + p.get_width() - 0.75
    y = p.get_y() + p.get_height()
    ax.annotate(total_count, (x, y))
In the unigrams dataset, the most common first letters are s and c, while the least frequent are x, y, and z.
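This can be confirmed numerically with a small added check:
# Most and least frequent first letters in the unigram lexicon
counts = unigrams['first_letter'].value_counts()
print(counts.head(3))  # most frequent
print(counts.tail(3))  # least frequent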
Similarly, we categorize the sentiment of each bigram as positive or negative.
# Add sentiment column
bigrams['sentiment'] = np.where(bigrams['SENTIMENT_SCORE'] > 0, 1, 0) # 1 is positive, 0 is negative
bigrams.head()
| | BIGRAM | POS_TAGS | SENTIMENT_SCORE | sentiment |
|---|---|---|---|---|
| 0 | abalone-divers | NN-NNS | -0.090230 | 0 |
| 1 | abandoned-animals | VBN-NNS | -0.089895 | 0 |
| 2 | abandoned-apartment | VBN-NN | -0.126907 | 0 |
| 3 | abandoned-attempts | VBN-NNS | -0.053709 | 0 |
| 4 | abandoned-babies | VBN-NNS | -0.074742 | 0 |
And then we visualize the sentiment polarity distribution.
# Plot the distribution of the bigram sentiment polarity scores
bigrams['SENTIMENT_SCORE'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')
We group the bigrams by POS_TAGS and sentiment and count the size of each group.
df = pd.DataFrame(bigrams.groupby(['POS_TAGS', 'sentiment']).size().reset_index())
df.head()
| | POS_TAGS | sentiment | 0 |
|---|---|---|---|
| 0 | JJ-JJ | 0 | 1698 |
| 1 | JJ-JJ | 1 | 2000 |
| 2 | JJ-JJR | 0 | 5 |
| 3 | JJ-JJR | 1 | 3 |
| 4 | JJ-JJS | 0 | 9 |
Next, visualize the POS-tag counts, showing positive versus negative sentiment for each tag pair.
import plotly.express as px

# Stacked bar chart of bigram counts per POS-tag pair, colored by sentiment
fig = px.bar(df, x="POS_TAGS", y=0, color="sentiment", title="Bigram Sentiment by POS Tags")
fig.show()

# Bar chart of total bigram counts per POS-tag pair, most frequent first
bigrams.groupby('POS_TAGS').count()['SENTIMENT_SCORE'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
    title='POS Tags Count', xTitle='POS Tag')
# Count the number of bigrams for each POS-tag pair and sentiment
group_data = bigrams.groupby(['POS_TAGS','sentiment'])
group_data.count()
| POS_TAGS | sentiment | BIGRAM | SENTIMENT_SCORE |
|---|---|---|---|
| JJ-JJ | 0 | 1698 | 1698 |
| JJ-JJ | 1 | 2000 | 2000 |
| JJ-JJR | 0 | 5 | 5 |
| JJ-JJR | 1 | 3 | 3 |
| JJ-JJS | 0 | 9 | 9 |
| ... | ... | ... | ... |
| VBP-NNS | 1 | 151 | 151 |
| VBZ-NN | 0 | 326 | 326 |
| VBZ-NN | 1 | 609 | 609 |
| VBZ-NNS | 0 | 179 | 179 |
| VBZ-NNS | 1 | 743 | 743 |

91 rows × 2 columns
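Because sentiment is coded as 0/1, the mean within each POS-tag group equals the share of positive bigrams, which makes it easy to see which patterns skew positive; this is an added sketch, not part of the original notebook:
# Share of positive bigrams per POS-tag pair, most positive first
pos_share = bigrams.groupby('POS_TAGS')['sentiment'].mean().sort_values(ascending=False)
pos_share.head()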
# Add a first-letter column for each bigram
bigrams['first_letter'] = bigrams.BIGRAM.str[0]
# Count the number of bigrams for each first letter and sentiment
group_data = bigrams.groupby(['first_letter', 'sentiment'])
group_data.count()
| first_letter | sentiment | BIGRAM | POS_TAGS | SENTIMENT_SCORE |
|---|---|---|---|---|
| a | 0 | 6784 | 6784 | 6784 |
| a | 1 | 7192 | 7192 | 7192 |
| b | 0 | 7012 | 7012 | 7012 |
| b | 1 | 5127 | 5127 | 5127 |
| c | 0 | 12077 | 12077 | 12077 |
| c | 1 | 11856 | 11856 | 11856 |
| d | 0 | 9196 | 9196 | 9196 |
| d | 1 | 6117 | 6117 | 6117 |
| e | 0 | 6377 | 6377 | 6377 |
| e | 1 | 9498 | 9498 | 9498 |
| f | 0 | 7880 | 7880 | 7880 |
| f | 1 | 5584 | 5584 | 5584 |
| g | 0 | 3486 | 3486 | 3486 |
| g | 1 | 4278 | 4278 | 4278 |
| h | 0 | 4982 | 4982 | 4982 |
| h | 1 | 3664 | 3664 | 3664 |
| i | 0 | 6628 | 6628 | 6628 |
| i | 1 | 6920 | 6920 | 6920 |
| j | 0 | 673 | 673 | 673 |
| j | 1 | 349 | 349 | 349 |
| k | 0 | 532 | 532 | 532 |
| k | 1 | 486 | 486 | 486 |
| l | 0 | 5206 | 5206 | 5206 |
| l | 1 | 4531 | 4531 | 4531 |
| m | 0 | 7747 | 7747 | 7747 |
| m | 1 | 6936 | 6936 | 6936 |
| n | 0 | 2711 | 2711 | 2711 |
| n | 1 | 2348 | 2348 | 2348 |
| o | 0 | 3872 | 3872 | 3872 |
| o | 1 | 3459 | 3459 | 3459 |
| p | 0 | 11531 | 11531 | 11531 |
| p | 1 | 9215 | 9215 | 9215 |
| q | 0 | 486 | 486 | 486 |
| q | 1 | 654 | 654 | 654 |
| r | 0 | 7508 | 7508 | 7508 |
| r | 1 | 7032 | 7032 | 7032 |
| s | 0 | 16508 | 16508 | 16508 |
| s | 1 | 13062 | 13062 | 13062 |
| t | 0 | 6951 | 6951 | 6951 |
| t | 1 | 5667 | 5667 | 5667 |
| u | 0 | 5066 | 5066 | 5066 |
| u | 1 | 2353 | 2353 | 2353 |
| v | 0 | 2427 | 2427 | 2427 |
| v | 1 | 2606 | 2606 | 2606 |
| w | 0 | 4293 | 4293 | 4293 |
| w | 1 | 2677 | 2677 | 2677 |
| x | 0 | 7 | 7 | 7 |
| x | 1 | 6 | 6 | 6 |
| y | 0 | 268 | 268 | 268 |
| y | 1 | 620 | 620 | 620 |
| z | 0 | 87 | 87 | 87 |
| z | 1 | 23 | 23 | 23 |
As with the unigrams, we visualize the bigram counts per first letter.
plt.figure(figsize=(20, 20))
sns.set(style="darkgrid")
ax = sns.countplot(x="first_letter", data=bigrams)
plt.title('Data Distribution')
# Annotate each bar with its total count
for p in ax.patches:
    total_count = str(p.get_height())
    x = p.get_x() + p.get_width() - 0.75
    y = p.get_y() + p.get_height()
    ax.annotate(total_count, (x, y))
Finally, we save the cleaned datasets as project assets for later reuse. You should see output similar to the following after saving each file:
{'file_name': 'bigrams.csv',
'message': 'File saved to project storage.',
'bucket_name': 'ibmdebatersentimentcompositionlex-...',
'asset_id': '...'}
Note: In order for this step to work, your project token (see the first cell of this notebook) must have the Editor role. By default, this will overwrite any existing file with the same name.
project.save_data("unigrams.csv", unigrams.to_csv(float_format='%g'), overwrite=True)
project.save_data("bigrams.csv", bigrams.to_csv(float_format='%g'), overwrite=True)
Next, run the Part 2 - Model Development notebook to explore the cleaned dataset.

Toledo-Ronen et al., Learning Sentiment Composition from Sentiment Lexicons, COLING 2018
This notebook was created by the Center for Open-Source Data & AI Technologies.